Click to show code
print('hello')
#> [1] "hello"The following machine learning project focuses on…
print('hello')
#> [1] "hello"# Python code
import numpy as np
print(np.mean([10, 20, 30, 40, 50]))
#> 30.0properties <- read.csv(file.path(here(),"data/properties.csv"))
# show 1000 first rows of properties using reactable
reactable(head(properties, 1000))
# Create a tibble with cantons and observations
observations_table <- tibble(
Canton = c("Vaud", "Bern", "Lucerne", "Zurich", "Uri", "Schwyz",
"Obwalden", "Nidwalden", "Glarus", "St. Gallen", "Grisons",
"Aargau", "Thurgau", "Ticino", "Valais", "Neuchatel",
"Geneva", "Jura", "Zug", "Fribourg", "Solothurn",
"Basel-Stadt", "Basel-Landschaft", "Schaffhausen",
"Appenzell-Ausser-Rhoden", "Appenzell-Inner-Rhoden", "Total"),
Observations = c(3232, 1553, 376, 1191, 71, 93, 29, 51, 55, 757, 405,
1481, 553, 4230, 3601, 513, 629, 329, 69, 1242, 590,
149, 705, 118, 102, 12, sum(c(3232, 1553, 376, 1191, 71, 93, 29, 51, 55, 757, 405,
1481, 553, 4230, 3601, 513, 629, 329, 69, 1242, 590,
149, 705, 118, 102, 12)))
)
# Display the table using kable and kableExtra
observations_table %>%
kbl(caption = "Number of Observations by Canton") %>%
kable_styling(position = "center", bootstrap_options = c("striped", "bordered", "hover")) %>%
add_header_above(c(" " = 1, "Observations" = 1)) # Adds headers spanning columns| Canton | Observations |
|---|---|
| Vaud | 3232 |
| Bern | 1553 |
| Lucerne | 376 |
| Zurich | 1191 |
| Uri | 71 |
| Schwyz | 93 |
| Obwalden | 29 |
| Nidwalden | 51 |
| Glarus | 55 |
| St. Gallen | 757 |
| Grisons | 405 |
| Aargau | 1481 |
| Thurgau | 553 |
| Ticino | 4230 |
| Valais | 3601 |
| Neuchatel | 513 |
| Geneva | 629 |
| Jura | 329 |
| Zug | 69 |
| Fribourg | 1242 |
| Solothurn | 590 |
| Basel-Stadt | 149 |
| Basel-Landschaft | 705 |
| Schaffhausen | 118 |
| Appenzell-Ausser-Rhoden | 102 |
| Appenzell-Inner-Rhoden | 12 |
| Total | 22136 |
# Identify values causing the issue
problematic_values <- properties$number_of_rooms[is.na(as.numeric(properties$number_of_rooms))]
#> Warning: NAs introduced by coercion
# Replace non-numeric values with NA
#properties$number_of_rooms <- as.numeric(gsub("[^0-9.]", "", properties$number_of_rooms))
# Remove non-numeric characters and convert to numeric
properties$price <- as.numeric(gsub("[^0-9]", "", properties$price))
# Subset the dataset to exclude rows with price < 20000
properties <- properties[properties$price >= 20000, ]
# Subset the dataset to exclude rows with numbers of rooms < 25
#properties <- properties[properties$number_of_rooms <25, ]
# Replace incomplete addresses
properties$address <- gsub("^\\W*[.,0-]\\W*", "", properties$address)
properties_filtered <- na.omit(properties)
properties_filtered$year_category <- substr(properties_filtered$year_category, 1, 9)
# Assuming 'year_category' is a column in the 'properties' dataset
properties_filtered$year_category <- as.factor(properties_filtered$year_category)
# remove m^2 from column 'square_meters'
properties_filtered$square_meters <- as.numeric(gsub("\\D", "", properties_filtered$square_meters))
# print how many NA observations left in square_meters
print(sum(is.na(properties_filtered$square_meters)))
#> [1] 1056
# remove NA
properties_filtered <- properties_filtered[!is.na(properties_filtered$square_meters),]
# add majuscule to canton
properties_filtered$canton <- tools::toTitleCase(properties_filtered$canton)
# show 100 first row of cleaned dataset using reactable
reactable(head(properties_filtered, 100))
#filter properties_filtered to contains only 'price', 'number_of_rooms', 'square_meters'
properties_summary <- properties_filtered[, c('price', 'number_of_rooms', 'square_meters')]
#summary statistics
summary(properties_summary)
#> price number_of_rooms square_meters
#> Min. : 25000 Length:21076 Min. : 1
#> 1st Qu.: 690000 Class :character 1st Qu.: 99
#> Median : 995000 Mode :character Median : 137
#> Mean : 1355554 Mean : 160
#> 3rd Qu.: 1550000 3rd Qu.: 190
#> Max. :26149500 Max. :2700
# Data
data <- data.frame(
price = c(25000, 690000, 992340, 1348429, 1550000, 26149500),
number_of_rooms = c(1.0, 35.0, 45.0, 41.1, 55.0, 185.0),
square_meters = c(1, 98, 136, 159, 190, 2000)
)
# Summary statistics
summary_stats <- summary(data)
# Summary statistics
summary_stats <- cbind(
Min = apply(data, 2, min),
Q1 = apply(data, 2, quantile, probs = 0.25),
Median = apply(data, 2, median),
Mean = apply(data, 2, mean),
Q3 = apply(data, 2, quantile, probs = 0.75),
Max = apply(data, 2, max)
)
# Create table
table <- kbl(summary_stats, align = rep('c', 6), caption = "Summary statistics for the dataset") %>%
kable_styling(position = "center", bootstrap_options = c("striped", "hover", "condensed", "responsive"))
table| Min | Q1 | Median | Mean | Q3 | Max | |
|---|---|---|---|---|---|---|
| price | 25000 | 765585.0 | 1170385 | 5.13e+06 | 1.50e+06 | 26149500 |
| number_of_rooms | 1 | 36.5 | 43 | 6.04e+01 | 5.25e+01 | 185 |
| square_meters | 1 | 107.5 | 148 | 4.31e+02 | 1.82e+02 | 2000 |
The dataset described is the “Official Index of Localities” (Répertoire officiel des localités) provided by the Swiss Federal Office of Topography (swisstopo). It contains comprehensive information about all localities in Switzerland and the Principality of Liechtenstein, including their names, postal codes, and perimeters.
This dataset is regularly updated on a monthly basis, incorporating changes reported by cantonal authorities and Swiss Post. It serves various purposes, including spatial analysis, integration with other geographic datasets, usage as a geolocated background in GIS (Geographic Information Systems) and CAD (Computer-Aided Design) systems, statistical analysis, and as a reference dataset for information systems.
Updates and release notes for the dataset are provided periodically, detailing changes and improvements made over time. The Swiss Federal Office of Topography manages and distributes this dataset as part of its responsibilities in collecting and providing official geospatial data for Switzerland.
Source - swisstopo
df <- properties_filtered
#the address column is like : '1844 Villeneuve VD' and has zip code number in it
#taking out the zip code number and creating a new column 'zip_code'
#the way to identify the zip code is to identify numbers that are 4 digits long
df$zip_code <- as.numeric(gsub("\\D", "", df$address))
#removing the first two number of zip code has more than 4 number
df$zip_code <- ifelse(df$zip_code > 9999, df$zip_code %% 10000, df$zip_code)#read .csv AMTOVZ_CSV_LV95
amto <- read.csv(file.path(here(),"data/AMTOVZ_CSV_WGS84.csv"), sep = ";")
#creating a new dataframe with 'Ortschaftsname' as 'City'Place_name', 'PLZ' as 'zip_code', 'KantonskÃ.rzel' as 'Canton_code', 'E' as 'lon' and 'N' as 'lat'
amto_df <- amto[, c('Gemeindename', 'PLZ', 'Kantonskürzel', 'E', 'N')]
#renaming the columns
colnames(amto_df) <- c('Community', 'zip_code', 'Canton_code', 'lon', 'lat')
#remove duplicates of zip code
amto_df <- amto_df[!duplicated(amto_df$zip_code),]
#add the variable of amto_df to the df if the zip code matches
df <- merge(df, amto_df, by = "zip_code", all.x = TRUE)
#check if there are nan in city
df[is.na(df$Community),]
#> zip_code price number_of_rooms square_meters
#> 1 25 2200000 6.5 rooms 165
#> 2 25 2200000 10 rooms 263
#> 3 26 655000 3.5 rooms 66
#> 4 26 1995000 7.5 rooms 180
#> 5 322 880000 2.5 rooms 55
#> 6 322 975000 3.5 rooms 56
#> 7 322 870000 2.5 rooms 59
#> 230 1014 1510000 5.5 rooms 146
#> 1148 1200 3285450 5 rooms 230
#> 1149 1200 16092000 7 rooms 400
#> 1150 1200 679000 5.5 rooms 142
#> 5531 1919 785000 3.5 rooms 103
#> 5532 1919 1908000 6.5 rooms 210
#> 5533 1919 1065000 4.5 rooms 130
#> 5534 1919 2558620 5.5 rooms 270
#> 7694 2500 420000 4.5 rooms 115
#> 7695 2500 887500 4.5 rooms 144
#> 7696 2500 877500 4.5 rooms 138
#> 7697 2500 872500 4.5 rooms 138
#> 7698 2500 892500 4.5 rooms 144
#> 7699 2500 870500 4.5 rooms 125
#> 7700 2500 887500 5.5 rooms 130
#> 7701 2500 885500 5.5 rooms 130
#> 7702 2500 1100000 5 rooms 154
#> 7703 2500 885500 5.5 rooms 130
#> 7704 2500 872500 4.5 rooms 144
#> 7705 2500 1050000 4.5 rooms 121
#> 7706 2500 1450000 5.5 rooms 198
#> 8402 3000 920000 4.5 rooms 157
#> 8403 3000 1090000 5.5 rooms 193
#> 8404 3000 1140000 3.5 rooms 115
#> 8405 3000 820000 5.5 rooms 165
#> 8406 3000 720000 3.5 rooms 102
#> 8407 3000 1090000 5.5 rooms 193
#> 8408 3000 1090000 3.5 rooms 115
#> 8409 3000 1590000 5.5 rooms 330
#> 8410 3000 920000 4.5 rooms 157
#> 10532 4000 180000 3 rooms 70
#> 10533 4000 975000 4.5 rooms 125
#> 10534 4000 2100000 6.5 rooms 360
#> 12482 5201 725000 3.5 rooms 95
#> 13343 6000 695000 4.5 rooms 133
#> 14101 6511 440000 2 rooms 64
#> 14377 6547 15000000 7.5 rooms 220
#> 14698 6602 450000 3.5 rooms 75
#> 14699 6602 270000 1.5 rooms 28
#> 14700 6602 2800000 7.5 rooms 242
#> 14701 6602 2800000 6.5 rooms 250
#> 14702 6604 1990000 4.5 rooms 220
#> 14703 6604 760000 3.5 rooms 78
#> 14704 6604 2668590 5.5 rooms 290
#> 16725 6901 3660930 4.5 rooms 290
#> 16726 6901 3660930 4.5 rooms 290
#> 16727 6903 790000 3.5 rooms 105
#> 16728 6907 995000 4.5 rooms 114
#> 16729 6907 995000 4.5 rooms 114
#> 16730 6911 737550 4.5 rooms 82
#> 16731 6911 610000 3.5 rooms 103
#> 16732 6911 469350 5.5 rooms 140
#> 16733 6911 660000 7.5 rooms 200
#> 18049 7133 2266290 5.5 rooms 160
#> 18058 7135 2690000 8.5 rooms 236
#> 18323 8000 2100000 4.5 rooms 152
#> 18324 8000 1990000 5.5 rooms 200
#> 18325 8000 1650000 4.5 rooms 142
#> 18326 8000 1150000 4.5 rooms 128
#> 18327 8000 975000 4.5 rooms 122
#> 18328 8000 1450000 5.5 rooms 143
#> 18329 8000 2495000 5.5 rooms 482
#> 18330 8000 1650000 4.5 rooms 142
#> 18331 8000 925000 3.5 rooms 102
#> 18332 8000 1990000 5.5 rooms 200
#> 18816 8238 245000 2 rooms 49
#> 19242 8423 2190000 5.5 rooms 167
#> 19243 8423 2110000 6.5 rooms 204
#> 20467 9241 730840 5.5 rooms 130
#> 20468 9241 545000 4.5 rooms 100
#> address
#> 1 1000 Lausanne 25
#> 2 1000 Lausanne 25
#> 3 1000 Lausanne 26
#> 4 Lausanne 26, 1000 Lausanne 26
#> 5 7032 Laax GR 2
#> 6 Via Murschetg 29, 7032 Laax GR 2
#> 7 Via Cuolm Liung 30d, 7032 Laax GR 2
#> 230 1014 Lausanne
#> 1148 1200 Genève
#> 1149 1200 Genève
#> 1150 Chemin des pralets, 74100 Etrembières, 1200 Genève
#> 5531 1919 Martigny
#> 5532 1919 Martigny
#> 5533 1919 Martigny
#> 5534 1919 Martigny
#> 7694 2500 Biel/Bienne
#> 7695 2500 Biel/Bienne
#> 7696 2500 Biel/Bienne
#> 7697 2500 Biel/Bienne
#> 7698 2500 Biel/Bienne
#> 7699 2500 Biel/Bienne
#> 7700 2500 Biel/Bienne
#> 7701 2500 Biel/Bienne
#> 7702 2500 Biel/Bienne
#> 7703 2500 Biel/Bienne
#> 7704 2500 Biel/Bienne
#> 7705 Hohlenweg 11b, 2500 Biel/Bienne
#> 7706 2500 Bienne
#> 8402 3000 Bern
#> 8403 3000 Bern
#> 8404 3000 Bern
#> 8405 3000 Bern
#> 8406 3000 Bern
#> 8407 3000 Bern
#> 8408 3000 Bern
#> 8409 3000 Bern
#> 8410 3000 Bern
#> 10532 Lörrach Brombach Steinsack 6, 4000 Basel
#> 10533 4000 Basel
#> 10534 4000 Basel
#> 12482 5201 Brugg AG
#> 13343 in TRIENGEN, ca. 20 min. bei Luzern, 6000 Luzern
#> 14101 6511 Cadenazzo
#> 14377 Augio 1F, 6547 Augio
#> 14698 Via Bacilieri 2, 6602 Muralto
#> 14699 6602 Muralto
#> 14700 6602 Muralto
#> 14701 6602 Muralto
#> 14702 6604 Solduno
#> 14703 6604 Locarno
#> 14704 6604 Solduno
#> 16725 6901 Lugano
#> 16726 6901 Lugano
#> 16727 6903 Lugano
#> 16728 6907 MASSAGNO
#> 16729 6907 MASSAGNO
#> 16730 6911 Campione d'Italia
#> 16731 6911 Campione d'Italia
#> 16732 6911 Campione d'Italia
#> 16733 6911 Campione d'Italia
#> 18049 Inder Platenga 34, 7133 Obersaxen
#> 18058 7135 Fideris
#> 18323 8000 Zürich
#> 18324 8000 Zürich
#> 18325 8000 Zürich
#> 18326 8000 Zürich
#> 18327 8000 Zürich
#> 18328 8000 Zürich
#> 18329 8000 Zürich
#> 18330 8000 Zürich
#> 18331 8000 Zürich
#> 18332 8000 Zürich
#> 18816 Stemmerstrasse 14, 8238 Büsingen am Hochrhein
#> 19242 Chüngstrasse 48, 8423 Embrach
#> 19243 Chüngstrasse 60, 8423 Embrach
#> 20467 9241 Kradolf
#> 20468 9241 Kradolf
#> canton property_type floor year_category Community
#> 1 Vaud Villa 2006-2010 <NA>
#> 2 Vaud Single house 1919-1945 <NA>
#> 3 Vaud Apartment noteg 2016-2024 <NA>
#> 4 Vaud Villa 1961-1970 <NA>
#> 5 Grisons Apartment noteg 2016-2024 <NA>
#> 6 Grisons Apartment noteg 2011-2015 <NA>
#> 7 Grisons Apartment eg 2016-2024 <NA>
#> 230 Vaud Apartment eg 2011-2015 <NA>
#> 1148 Geneva Bifamiliar house 1981-1990 <NA>
#> 1149 Geneva Single house 2011-2015 <NA>
#> 1150 Geneva Bifamiliar house 2016-2024 <NA>
#> 5531 Valais Apartment noteg 2016-2024 <NA>
#> 5532 Valais Apartment noteg 2016-2024 <NA>
#> 5533 Valais Apartment noteg 2016-2024 <NA>
#> 5534 Valais Attic flat noteg 2016-2024 <NA>
#> 7694 Bern Apartment noteg 1971-1980 <NA>
#> 7695 Bern Single house 2016-2024 <NA>
#> 7696 Bern Single house 2016-2024 <NA>
#> 7697 Bern Single house 2016-2024 <NA>
#> 7698 Bern Single house 2016-2024 <NA>
#> 7699 Bern Single house 2016-2024 <NA>
#> 7700 Bern Single house 2016-2024 <NA>
#> 7701 Bern Single house 2016-2024 <NA>
#> 7702 Bern Single house 2001-2005 <NA>
#> 7703 Bern Villa 2016-2024 <NA>
#> 7704 Bern Villa 2016-2024 <NA>
#> 7705 Bern Single house 2001-2005 <NA>
#> 7706 Bern Single house 2016-2024 <NA>
#> 8402 Bern Apartment noteg 2016-2024 <NA>
#> 8403 Bern Apartment noteg 2016-2024 <NA>
#> 8404 Bern Apartment eg 2016-2024 <NA>
#> 8405 Bern Apartment noteg 2016-2024 <NA>
#> 8406 Bern Apartment eg 2016-2024 <NA>
#> 8407 Bern Roof flat noteg 2016-2024 <NA>
#> 8408 Bern Apartment eg 2016-2024 <NA>
#> 8409 Bern Apartment noteg 1991-2000 <NA>
#> 8410 Bern Duplex noteg 2016-2024 <NA>
#> 10532 Basel-Stadt Single house 1961-1970 <NA>
#> 10533 Basel-Stadt Single house 2016-2024 <NA>
#> 10534 Basel-Stadt Villa 2016-2024 <NA>
#> 12482 Aargau Apartment noteg 2016-2024 <NA>
#> 13343 Lucerne Apartment noteg 1991-2000 <NA>
#> 14101 Ticino Apartment noteg 2016-2024 <NA>
#> 14377 Grisons Single house 2016-2024 <NA>
#> 14698 Ticino Apartment noteg 1946-1960 <NA>
#> 14699 Ticino Apartment eg 1961-1970 <NA>
#> 14700 Ticino Single house 1981-1990 <NA>
#> 14701 Ticino Single house 1981-1990 <NA>
#> 14702 Ticino Attic flat noteg 2011-2015 <NA>
#> 14703 Ticino Apartment noteg 2011-2015 <NA>
#> 14704 Ticino Apartment noteg 2011-2015 <NA>
#> 16725 Ticino Attic flat noteg 2011-2015 <NA>
#> 16726 Ticino Apartment noteg 2011-2015 <NA>
#> 16727 Ticino Apartment noteg 2006-2010 <NA>
#> 16728 Ticino Apartment noteg 2016-2024 <NA>
#> 16729 Ticino Apartment noteg 2016-2024 <NA>
#> 16730 Ticino Apartment noteg 1991-2000 <NA>
#> 16731 Ticino Apartment eg 1946-1960 <NA>
#> 16732 Ticino Apartment noteg 1946-1960 <NA>
#> 16733 Ticino Single house 1971-1980 <NA>
#> 18049 Grisons Single house 2006-2010 <NA>
#> 18058 Grisons Single house 0-1919 <NA>
#> 18323 Zurich Apartment noteg 2016-2024 <NA>
#> 18324 Zurich Apartment noteg 2006-2010 <NA>
#> 18325 Zurich Attic flat noteg 2016-2024 <NA>
#> 18326 Zurich Apartment noteg 2016-2024 <NA>
#> 18327 Zurich Single house 2016-2024 <NA>
#> 18328 Zurich Apartment eg 2016-2024 <NA>
#> 18329 Zurich Apartment noteg 0-1919 <NA>
#> 18330 Zurich Apartment noteg 2016-2024 <NA>
#> 18331 Zurich Apartment noteg 2016-2024 <NA>
#> 18332 Zurich Attic flat noteg 2006-2010 <NA>
#> 18816 Schaffhausen Apartment noteg 1961-1970 <NA>
#> 19242 Zurich Single house 2016-2024 <NA>
#> 19243 Zurich Bifamiliar house 2016-2024 <NA>
#> 20467 Thurgau Apartment noteg 1991-2000 <NA>
#> 20468 Thurgau Apartment noteg 1991-2000 <NA>
#> Canton_code lon lat
#> 1 <NA> NA NA
#> 2 <NA> NA NA
#> 3 <NA> NA NA
#> 4 <NA> NA NA
#> 5 <NA> NA NA
#> 6 <NA> NA NA
#> 7 <NA> NA NA
#> 230 <NA> NA NA
#> 1148 <NA> NA NA
#> 1149 <NA> NA NA
#> 1150 <NA> NA NA
#> 5531 <NA> NA NA
#> 5532 <NA> NA NA
#> 5533 <NA> NA NA
#> 5534 <NA> NA NA
#> 7694 <NA> NA NA
#> 7695 <NA> NA NA
#> 7696 <NA> NA NA
#> 7697 <NA> NA NA
#> 7698 <NA> NA NA
#> 7699 <NA> NA NA
#> 7700 <NA> NA NA
#> 7701 <NA> NA NA
#> 7702 <NA> NA NA
#> 7703 <NA> NA NA
#> 7704 <NA> NA NA
#> 7705 <NA> NA NA
#> 7706 <NA> NA NA
#> 8402 <NA> NA NA
#> 8403 <NA> NA NA
#> 8404 <NA> NA NA
#> 8405 <NA> NA NA
#> 8406 <NA> NA NA
#> 8407 <NA> NA NA
#> 8408 <NA> NA NA
#> 8409 <NA> NA NA
#> 8410 <NA> NA NA
#> 10532 <NA> NA NA
#> 10533 <NA> NA NA
#> 10534 <NA> NA NA
#> 12482 <NA> NA NA
#> 13343 <NA> NA NA
#> 14101 <NA> NA NA
#> 14377 <NA> NA NA
#> 14698 <NA> NA NA
#> 14699 <NA> NA NA
#> 14700 <NA> NA NA
#> 14701 <NA> NA NA
#> 14702 <NA> NA NA
#> 14703 <NA> NA NA
#> 14704 <NA> NA NA
#> 16725 <NA> NA NA
#> 16726 <NA> NA NA
#> 16727 <NA> NA NA
#> 16728 <NA> NA NA
#> 16729 <NA> NA NA
#> 16730 <NA> NA NA
#> 16731 <NA> NA NA
#> 16732 <NA> NA NA
#> 16733 <NA> NA NA
#> 18049 <NA> NA NA
#> 18058 <NA> NA NA
#> 18323 <NA> NA NA
#> 18324 <NA> NA NA
#> 18325 <NA> NA NA
#> 18326 <NA> NA NA
#> 18327 <NA> NA NA
#> 18328 <NA> NA NA
#> 18329 <NA> NA NA
#> 18330 <NA> NA NA
#> 18331 <NA> NA NA
#> 18332 <NA> NA NA
#> 18816 <NA> NA NA
#> 19242 <NA> NA NA
#> 19243 <NA> NA NA
#> 20467 <NA> NA NA
#> 20468 <NA> NA NAWe have 77 NAN, where
Removed them ::: {.cell layout-align=“center”}
#remove the rows with nan in city
properties_filtered <- df[!is.na(df$Community),]
reactable(head(properties_filtered, 100)):::
# read csv
impots <- read.csv(file.path(here(),"data/estv_income_rates.csv"), sep = ",", header = TRUE, stringsAsFactors = FALSE)
# Remove 1st row
impots <- impots[-1, ]
# Remove 3rd column
impots <- impots[, -3]
# Combine text for columns 4-8
impots[1, 4:8] <- "Impôt sur le revenu"
# Combine text for columns 9-13
impots[1, 9:13] <- "Impôt sur la fortune"
# Combine text for columns 14-16
impots[1, 14:16] <- "Impôt sur le bénéfice"
# Combine text for columns 17-19
impots[1, 17:19] <- "Impôt sur le capital"
# Combine content of the first 2 rows into the 2nd row
impots[2, ] <- apply(impots[1:2, ], 2, function(x) paste(ifelse(is.na(x[1]), x[2], ifelse(is.na(x[2]), x[1], paste(x[1], x[2], sep = " ")))))
# Remove 1st row
impots <- impots[-1, ]
# Assign the text to the 1st row and 1st column
impots[1, 1] <- "Coefficient d'impôt en %"
# Replace column names with the content of the first row
colnames(impots) <- impots[1, ]
impots <- impots[-1, ]
# Check for missing values in impots
any_missing <- any(is.na(impots))
if (any_missing) {
print("There are missing values in impots.")
} else {
print("There are no missing values in impots.")
}
#> [1] "There are no missing values in impots."
# Replace row names with the content of the 3rd column
row.names(impots) <- impots[, 3]
impots <- impots[, -3]
# Remove 2nd column (to avoid canton column)
impots <- impots[, -2]
# Remove impot eglise
impots <- impots[, -c(4:6)]
impots <- impots[, -c(6:8)]
impots <- impots[, -8]
impots <- impots[, -10]
# Clean data and convert to numeric
cleaned_impots <- apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))
# Replace NA values with 0
cleaned_impots[is.na(cleaned_impots)] <- 0
# Check for non-numeric values
non_numeric <- sum(!is.na(cleaned_impots) & !is.numeric(cleaned_impots))
if (non_numeric > 0) {
print(paste("Warning: Found", non_numeric, "non-numeric values."))
}
rownames(cleaned_impots) <- rownames(impots)
#reactable(head(cleaned_impots, 100))Replaces NAs in both Taux de couverture social and Political (Conseil National Datas) For Taux de couverture Social: NAs were due to reason “Q” = “Not indicated to protect confidentiality” We replaced the NAs by the average taux de couverture in Switzerland in 2019, which was 3.2%
For Political data: NAs were due to reason “M” = “Not indicated because data was not important or applicable” Therefore, we replaced the NAs by 0
# il faudra changer le path
commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
# We keep only 2019 to have some reference? (2020 is apparently not really complete)
commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%
select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))
# delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne
commune_2019 <- subset(commune_2019, STATUS == "A") %>%
select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))
# on enlève les lignes qui sont des aggrégats
commune_2019 <- subset(commune_2019, REGION != "Schweiz")
commune_2019 <- commune_2019 %>%
pivot_wider(names_from = INDICATORS, values_from = VALUE)
# Rename columns using the provided map
commune <- commune_2019 %>%
rename(`Population - Habitants` = Ind_01_01,
`Population - Densité de la population` = Ind_01_03,
`Population - Etrangers` = Ind_01_08,
`Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,
`Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,
`Population - Part du groupe d'âge 65+ ans` = Ind_01_06,
`Population - Taux brut de nuptialité` = Ind_01_09,
`Population - Taux brut de divortialité` = Ind_01_10,
`Population - Taux brut de natalité` = Ind_01_11,
`Population - Taux brut de mortalité` = Ind_01_12,
`Population - Ménages privés` = Ind_01_13,
`Population - Taille moyenne des ménages` = Ind_01_14,
`Sécurité sociale - Taux d'aide sociale` = Ind_11_01,
`Conseil national - PLR` = Ind_14_01,
`Conseil national - PDC` = Ind_14_02,
`Conseil national - PS` = Ind_14_03,
`Conseil national - UDC` = Ind_14_04,
`Conseil national - PEV/PCS` = Ind_14_05,
`Conseil national - PVL` = Ind_14_06,
`Conseil national - PBD` = Ind_14_07,
`Conseil national - PST/Sol.` = Ind_14_08,
`Conseil national - PES` = Ind_14_09,
`Conseil national - Petits partis de droite` = Ind_14_10)
# If no one voted for a party, set as NA -> replacing it with 0 instead
commune <- commune %>%
mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))
# Removing NAs from Taux de couverture sociale column
# Setting the mean as the mean for Switzerland in 2019 (3.2%)
mean_taux_aide_social <- 3.2
# Replace NA values with the mean
commune <- commune %>%
mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))
#show 100 first rows of commune using reactable
reactable(head(commune, 100))
# commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
#
# # We keep only 2019 to have some reference? (2020 is apparently not really complete)
# commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%
# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))
#
# # delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne
# commune_2019 <- subset(commune_2019, STATUS == "A") %>%
# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))
#
# # on enlève les lignes qui sont des aggrégats
# commune_2019 <- subset(commune_2019, REGION != "Schweiz")
#
# commune_2019 <- commune_2019 %>%
# pivot_wider(names_from = INDICATORS, values_from = VALUE)
#
# # Rename columns using the provided map
# commune <- commune_2019 %>%
# rename(`Population - Habitants` = Ind_01_01,
# `Population - Densité de la population` = Ind_01_03,
# `Population - Etrangers` = Ind_01_08,
# `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,
# `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,
# `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,
# `Population - Taux brut de nuptialité` = Ind_01_09,
# `Population - Taux brut de divortialité` = Ind_01_10,
# `Population - Taux brut de natalité` = Ind_01_11,
# `Population - Taux brut de mortalité` = Ind_01_12,
# `Population - Ménages privés` = Ind_01_13,
# `Population - Taille moyenne des ménages` = Ind_01_14,
# `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,
# `Conseil national - PLR` = Ind_14_01,
# `Conseil national - PDC` = Ind_14_02,
# `Conseil national - PS` = Ind_14_03,
# `Conseil national - UDC` = Ind_14_04,
# `Conseil national - PEV/PCS` = Ind_14_05,
# `Conseil national - PVL` = Ind_14_06,
# `Conseil national - PBD` = Ind_14_07,
# `Conseil national - PST/Sol.` = Ind_14_08,
# `Conseil national - PES` = Ind_14_09,
# `Conseil national - Petits partis de droite` = Ind_14_10)
#
# # If no one voted for a party, set as NA -> replacing it with 0 instead
# commune <- commune %>%
# mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))
#
#
# # Removing NAs from Taux de couverture sociale column
# # Setting the mean as the mean for Switzerland in 2019 (3.2%)
# mean_taux_aide_social <- 3.2
#
# # Replace NA values with the mean
# commune <- commune %>%
# mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))
# Trying to Cluster commune datas to: 1. Reduce dimension 2. See similarities
A regarder, est-ce qu’on fait un cluster pour les datas politques + un cluster pour les data démographiques, ou est-ce qu’on regroupe tout?
set.seed(123)
# Clustering demographic
cols_commune_demographic <- select(commune, -c("REGION", "CODE_REGION","Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))
# Scale the columns, some are total numbers, some are percentages
cols_commune_demographic <- scale(cols_commune_demographic)
# Calculate the distance matrix
dist_matrix_demographic <- dist(cols_commune_demographic, method = "minkowski")
# Perform hierarchical clustering
hclust_model_demographic <- hclust(dist_matrix_demographic, method = "ward.D")
# Create dendrogram
dend_demo <- as.dendrogram(hclust_model_demographic)
dend_demo <- color_branches(dend_demo, k = 5) #Set number of cluster to 5, to keep the same scale for all our variables
plot(dend_demo, main = "Demographics - Hierarchical Clustering Dendrogram")# Clustering politics
set.seed(123)
cols_commune_politics <- select(commune, c("Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))
# Scale the columns, some are total numbers, some are percentages
cols_commune_politics <- scale(cols_commune_politics)
# Calculate the distance matrix
dist_matrix_politics <- dist(cols_commune_politics, method = "minkowski")
# Perform hierarchical clustering
hclust_model_politics <- hclust(dist_matrix_politics, method = "ward.D")
# Create dendrogram
dend_pol <- as.dendrogram(hclust_model_politics)
dend_pol <- color_branches(dend_pol, k = 5) #Set number of cluster to 5, to keep the same scale for all our variables
plot(dend_pol, main = "Politics - Hierarchical Clustering Dendrogram")To prevent introducing 10 new types of taxes, we conducted a clustering analysis on the tax dataset to identify which municipalities can be grouped together. Based on the within-cluster sum of squares, we found 5 clusters. These 5 distinct clusters will be assigned to properties to determine which municipalities are subject to a particular type of tax. ## Tax ::: {.cell layout-align=“center”}
set.seed(123)
# Clean data and convert to numeric
cleaned_impots <- apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))
cleaned_impots[is.na(cleaned_impots)] <- 0 # Replace NA values with 0
# Scale the features
scaled_impots <- scale(cleaned_impots)
# Perform k-means clustering
k <- 2 # Initial guess for the number of clusters
kmeans_model <- kmeans(scaled_impots, centers = k)
# Check within-cluster sum of squares (elbow method)
wss <- numeric(10)
for (i in 1:10) {
kmeans_model <- kmeans(scaled_impots, centers = i)
wss[i] <- sum(kmeans_model$withinss)
}
#plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
# Adjust k based on elbow method
k <- 5
# Perform k-means clustering again with optimal k
kmeans_model <- kmeans(scaled_impots, centers = k)
# Assign cluster labels to dendrogram
clusters <- kmeans_model$cluster
# Plot dendrogram
#colored_dend <- color_branches(dend, k = 5)
#y_zoom_range <- c(0, 80) # Adjust the y-axis range as needed
#plot(colored_dend, main = "Hierarchical Clustering Dendrogram", horiz = FALSE, ylim = y_zoom_range):::
# Get the cluster centers
cluster_centers <- kmeans_model$centers
# Create a data frame with cluster centers
cluster_centers_df <- data.frame(cluster = 1:k, cluster_centers)
# Print cluster centers
print(cluster_centers_df)
#> cluster Coefficient.d.impôt.en.. Impôt.sur.le.revenu.Canton
#> 1 1 0.400 -0.611
#> 2 2 0.786 -0.390
#> 3 3 -0.318 -0.266
#> 4 4 -0.792 -0.699
#> 5 5 -0.941 1.932
#> Impôt.sur.le.revenu.Commune Impôt.sur.la.fortune.Canton
#> 1 -0.0839 -0.615
#> 2 -0.5732 -0.394
#> 3 0.9387 -0.270
#> 4 -0.3119 -0.690
#> 5 1.3332 1.933
#> Impôt.sur.la.fortune.Commune Impôt.sur.le.bénéfice.Canton
#> 1 -0.0849 1.951
#> 2 -0.5725 -0.249
#> 3 0.9383 -0.869
#> 4 -0.3128 -0.709
#> 5 1.3330 1.402
#> Impôt.sur.le.bénéfice.Commune Impôt.sur.le.capital.Canton
#> 1 -1.3419 1.879
#> 2 -0.6444 -0.276
#> 3 1.9896 -0.874
#> 4 0.0565 -0.718
#> 5 0.8732 1.492
#> Impôt.sur.le.capital.Commune
#> 1 -1.30508
#> 2 -0.65169
#> 3 1.81595
#> 4 0.00514
#> 5 1.01512
# Calculate the size of each cluster
cluster_sizes <- table(kmeans_model$cluster)
# Print cluster sizes
print(cluster_sizes)
#>
#> 1 2 3 4 5
#> 75 999 178 462 417
# Get the cluster labels
cluster_labels <- kmeans_model$cluster
# Convert cleaned_impots to a data frame
impots_cluster <- as.data.frame(cleaned_impots)
# Add the cluster labels to cleaned_impots
impots_cluster$cluster <- cluster_labels
rownames(impots_cluster) <- rownames(impots)
impots_cluster <- impots_cluster %>%
rownames_to_column(var = "Community")# Preparing df_commune for merging with main dataset
df_commune <- select(commune, REGION)
df_commune$Demographic_cluster <- cutree(hclust_model_demographic, k = 5)
df_commune$Political_cluster <- cutree(hclust_model_politics, k = 5)
# Preparing to merge
merging <- inner_join(amto_df, df_commune, by = c("Community" = "REGION"))
impots_cluster_subset <- impots_cluster[, c("Community", "cluster")]
merging <- merging %>%
left_join(impots_cluster_subset, by = "Community")
clusters_df <- merging %>%
rename(Tax_cluster = cluster) %>%
rename(Commune = Community)
clusters_df <- clusters_df %>%
select(c("Commune", "zip_code", "Canton_code", "Demographic_cluster", "Political_cluster", "Tax_cluster"))
# Only NAs are for commune Brugg, (written Brugg (AG) in the other data set) -> j'entre le cluster à la mano
clusters_df$Tax_cluster[is.na(clusters_df$Tax_cluster)] <- 2
# adding it to our main data set:
properties_filtered <- merge(properties_filtered, clusters_df[, c("zip_code", "Demographic_cluster", "Political_cluster", "Tax_cluster")], by = "zip_code", all.x = TRUE)
na_count <- sum(is.na(properties_filtered[, c("Demographic_cluster", "Political_cluster", "Tax_cluster")]))
# Print the result
if (na_count > 0) {
print("There are NA values in the merged dataframe.")
print(na_count)
} else {
print("There are no NA values in the merged dataframe.")
}
#> [1] "There are NA values in the merged dataframe."
#> [1] 684
# Find rows with NA values in the specified columns
na_rows <- subset(properties_filtered, is.na(Demographic_cluster) | is.na(Political_cluster) | is.na(Tax_cluster))# Create a leaflet map with optimized markers
map <- leaflet(properties_filtered) %>%
addTiles() %>% # Add default OpenStreetMap tiles
addProviderTiles(providers$Esri.NatGeoWorldMap) %>% # Add topographic maps for context
addCircleMarkers(
~lon, ~lat,
radius = 1.5, # Smaller radius for the circle markers
color = "#32012F", # Specifying a color for the markers
fillOpacity = 0.2, # Semi-transparent fill
stroke = FALSE, # No border to the circle markers to reduce visual noise
popup = ~paste("Price: ", price, "<br>",
"Rooms: ", number_of_rooms, "<br>",
"Type: ", property_type, "<br>",
"Year: ", year_category),
label = ~paste("Price: ", price) # Tooltip on hover
) %>% addLegend(
position = "bottomright", # Position the legend at the bottom right
colors = "#32012F", # Use the same color as the markers
labels = "Properties" # Label for the legend
)
map$width <- "100%" # Set the width of the map to 100%
map$height <- 600 # Set the height of the map to 600 pixels
maphistogram_price <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "red") +
labs(title = "Distribution of Prices",
x = "Price",
y = "Frequency") +
theme_minimal()
# Convert ggplot object to plotly object
interactive_histogram_price <- ggplotly(histogram_price, width = 600, height = 400 )
# Display the interactive histogram
interactive_histogram_pricenote : only price between 0 and 500000 so some outliers aren’t here
# Create the ggplot object
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ property_type, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Property Type",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogramnote : only price between 0 and 500000 so some outliers aren’t here
# Create a histogram of prices for each year category
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ year_category, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Year Category",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object
interactive_histogram_year <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogram_yearnote : only price between 0 and 500000 so some outliers aren’t here
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ canton, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Canton for properties between 0 and 5 million",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000) %>%
layout(height = 1000) # Adjust the height as needed
# Display the interactive plot
interactive_histogramhistogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ canton, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Canton for properties between 5 million and 10 million",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim( 5000000,10000000)
# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000) %>%
layout(height = 1000) # Adjust the height as needed
# Display the interactive plot
interactive_histogramnote : only price between 0 and 500000 so some outliers aren’t here
# Create a histogram of prices for each number of rooms
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ number_of_rooms, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Number of Rooms",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000)%>%
layout(height = 2000)
# Display the interactive plot
interactive_histogramhistogram <- ggplot(properties_filtered, aes(x = square_meters)) +
geom_histogram(binwidth = 15, fill = "skyblue", color = "black") +
labs(title = "Distribution of Properties by Square Meters",
x = "Square Meters",
y = "Frequency") +
theme_minimal() +
xlim(0, 2000)
# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = NULL, height = NULL) # Adjust width and height as needed
#> Warning: Removed 2 rows containing non-finite outside the scale range
#> (`stat_bin()`).
# Display the interactive plot
interactive_histogram# Create the boxplot
boxplot <- ggplot(properties_filtered, aes(x = as.factor(Tax_cluster), y = price)) +
geom_boxplot(fill = "skyblue", color = "black") +
labs(title = "Boxplot of Property Prices by Tax Cluster",
x = "Tax Cluster",
y = "Price") +
theme_minimal() +
ylim(100000, 400000)
# Convert ggplot object to plotly object
interactive_boxplot <- ggplotly(boxplot)
interactive_boxplotimpot_cols <- names(properties_filtered)[startsWith(names(properties_filtered), "Impôt")]
# Count the number of NA values in selected columns
na_counts <- colSums(is.na(properties_filtered[impot_cols]))
# Print the counts
print(na_counts)
#> numeric(0)